BioInformatics and Machine Learning Intern

# Load the dplyr package
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Read an RDS file and assign it to a variable
df <- readRDS("exgr_test.rds")

# View the data
head(df)

Task 1: Counting Unique Transcripts

To find the count of unique transcripts, we identify the unique transcript IDs within the dataset. Each transcript is uniquely represented by its transcript ID

unique_transcripts <- length(unique(df$transcript_id))
unique_transcripts
## [1] 234485

As you can see there are 234485 transcripts in the entire datatset

##Task 2: Counting Unique Exons ### To determine the number of unique exons, we follow these steps: 1. Group the data by transcript IDs. 2. Calculate the count of exons within each transcript. 3. Sum up the counts from all transcripts to obtain the total count of unique exons.

exon_counts_per_transcript <- df %>%
  group_by(transcript_id) %>%
  summarize(group_count = n())

# View the result
print(exon_counts_per_transcript)
## # A tibble: 234,485 × 2
##    transcript_id group_count
##            <int>       <int>
##  1             1           3
##  2             2           6
##  3             3           3
##  4             4           2
##  5             5           1
##  6             6           1
##  7             7           3
##  8             8           1
##  9             9           3
## 10            10           1
## # … with 234,475 more rows
# Calculate the sum of group counts
total_unique_exons <- exon_counts_per_transcript %>%
  summarize(sum_group_count = sum(group_count))

print(total_unique_exons)
## # A tibble: 1 × 1
##   sum_group_count
##             <int>
## 1         1460986

There are 140986 total unique exons in the entire dataset.

Task 3: Calculating Average and Median Exon Length

To calculate the average and median length of an exon, we utilize the ‘mean’ and ‘median’ functions applied to the ‘width’ column, which represents the exon width.

# Calculate the mean (average) length
average_length <- mean(df$width)

# Calculate the median length
average_median <- median(df$width)

# Print the results
average_length
## [1] 262.9931
average_median
## [1] 130

Task 4: Calculating the length of introns

Inorder to calculate the length of introns we need to handle the positive strands and negative strands differently because

Length of intron = Start-co-ordinate of exon - end-co-ordinate of previous exon

For positive strands,the end coordinate of previous exon would be the end cordinate of the above row

For negative strands,the end coordinate of previos exon would be end coordinate of the row below

# Load necessary libraries
library(dplyr)
library(tidyr)

# Separate the DataFrame into positive and negative strand DataFrames
positive_strand_df <- df %>%
  filter(strand == '+')

negative_strand_df <- df %>%
  filter(strand == '-')

# Sort the positive strand DataFrame by 'transcript_id' and 'rank'
positive_strand_df <- positive_strand_df %>%
  arrange(transcript_id, rank)

# Calculate intron lengths for positive strand exons
positive_strand_df <- positive_strand_df %>%
  group_by(transcript_id) %>%
  mutate(intron_length = start - lag(end, default =NA ) - 1)

positive_strand_df$intron_length <- ifelse(is.na(positive_strand_df$intron_length), 0, positive_strand_df$intron_length)
positive_strand_df$intron_length <- as.integer(positive_strand_df$intron_length)



# Sort the negative strand DataFrame by 'transcript_id' and 'rank'
negative_strand_df <- negative_strand_df %>%
  arrange(transcript_id, rank)

# Calculate intron lengths for negative strand exons
negative_strand_df <- negative_strand_df %>%
  group_by(transcript_id) %>%
  mutate(intron_length = start - lead(end, default = NA)-1)

negative_strand_df$intron_length <- ifelse(is.na(negative_strand_df$intron_length), 0, negative_strand_df$intron_length)
negative_strand_df$intron_length <- as.integer(negative_strand_df$intron_length)

# Combine positive and negative strand DataFrames row-wise
combined_df <- bind_rows(positive_strand_df, negative_strand_df)

# Fill NA values in the 'intron_length' column with 0 and convert to integer
combined_df$intron_length <- ifelse(is.na(combined_df$intron_length), 0, combined_df$intron_length)
combined_df$intron_length <- as.integer(combined_df$intron_length)

# Sort the combined DataFrame by 'transcript_id' and 'rank'
combined_df <- combined_df %>%
  arrange(transcript_id, rank)

# Print the resulting DataFrame
print(combined_df)
## # A tibble: 1,460,986 × 11
## # Groups:   transcript_id [234,485]
##    transcript_id transc…¹ seque…² start   end width strand exon_id exon_…³  rank
##            <int> <chr>    <fct>   <int> <int> <int> <fct>    <int> <chr>   <int>
##  1             1 ENST000… chr1    11869 12227   359 +            1 ENSE00…     1
##  2             1 ENST000… chr1    12613 12721   109 +            5 ENSE00…     2
##  3             1 ENST000… chr1    13221 14409  1189 +            8 ENSE00…     3
##  4             2 ENST000… chr1    12010 12057    48 +            2 ENSE00…     1
##  5             2 ENST000… chr1    12179 12227    49 +            3 ENSE00…     2
##  6             2 ENST000… chr1    12613 12697    85 +            4 ENSE00…     3
##  7             2 ENST000… chr1    12975 13052    78 +            6 ENSE00…     4
##  8             2 ENST000… chr1    13221 13374   154 +            7 ENSE00…     5
##  9             2 ENST000… chr1    13453 13670   218 +            9 ENSE00…     6
## 10             3 ENST000… chr1    29554 30039   486 +           10 ENSE00…     1
## # … with 1,460,976 more rows, 1 more variable: intron_length <int>, and
## #   abbreviated variable names ¹​transcript_name, ²​sequence_names, ³​exon_name

Bonus Task - Calculate L1, L2, U1, U2 ensuring no overlap

Inorder to calculate l1, l2, u1, u2 for each exon(except the first and last exon) we need both the intron length on left side (that is the current intron length) and the intron length on right side (that is the intron length of next exon)

So first lets create a column which also stores the intron length of the next exon

As for calculating intron length we need to divide the dataset into 2 parts (one for positive strands and one for negative strands)

The same way we will divide it and calculate the next intron length

# Load necessary libraries
library(dplyr)

# For negative strand DataFrame:
# Calculate the previous intron length within each transcript group
negative_strand_df <- negative_strand_df %>%
  group_by(transcript_id) %>%
  mutate(next_intron_length = lag(intron_length, default = 0)) %>%
  ungroup() %>%
  mutate(next_intron_length = as.integer(next_intron_length))

# For positive strand DataFrame:
# Calculate the previous intron length within each transcript group
positive_strand_df <- positive_strand_df %>%
  group_by(transcript_id) %>%
  mutate(next_intron_length = lead(intron_length, default = 0)) %>%
  ungroup() %>%
  mutate(next_intron_length = as.integer(next_intron_length))


# Combine positive and negative strand DataFrames row-wise
complete_df <- bind_rows(positive_strand_df, negative_strand_df)

# Fill NA values in the 'intron_length' column with 0 and convert to integer
complete_df$intron_length <- ifelse(is.na(complete_df$intron_length), 0, complete_df$intron_length)
complete_df$intron_length <- as.integer(complete_df$intron_length)

# Sort the combined DataFrame by 'transcript_id' and 'rank'
complete_df <- complete_df %>%
  arrange(transcript_id, rank)

# Print the resulting DataFrame
print(complete_df)
## # A tibble: 1,460,986 × 12
##    transcript_id transc…¹ seque…² start   end width strand exon_id exon_…³  rank
##            <int> <chr>    <fct>   <int> <int> <int> <fct>    <int> <chr>   <int>
##  1             1 ENST000… chr1    11869 12227   359 +            1 ENSE00…     1
##  2             1 ENST000… chr1    12613 12721   109 +            5 ENSE00…     2
##  3             1 ENST000… chr1    13221 14409  1189 +            8 ENSE00…     3
##  4             2 ENST000… chr1    12010 12057    48 +            2 ENSE00…     1
##  5             2 ENST000… chr1    12179 12227    49 +            3 ENSE00…     2
##  6             2 ENST000… chr1    12613 12697    85 +            4 ENSE00…     3
##  7             2 ENST000… chr1    12975 13052    78 +            6 ENSE00…     4
##  8             2 ENST000… chr1    13221 13374   154 +            7 ENSE00…     5
##  9             2 ENST000… chr1    13453 13670   218 +            9 ENSE00…     6
## 10             3 ENST000… chr1    29554 30039   486 +           10 ENSE00…     1
## # … with 1,460,976 more rows, 2 more variables: intron_length <int>,
## #   next_intron_length <int>, and abbreviated variable names ¹​transcript_name,
## #   ²​sequence_names, ³​exon_name
#copying the df
copy_of_complete_dataframe <- complete_df

If the length, next length and width are greater than 200 then calculating them is much easier. We simply use the below formulas

  1. L1= start_coordinate - 100
  2. L2= start_cordinate + 100
  3. u1= end_cordinate - 100
  4. u2= end_cordinate + 100

Lets consider each one and understand what will happen if these (length, next length or width are greater than 200)

  1. L1

     According to the question L1 is 100 units of length before sj (start-cordinate) Hence L1 = start - 100
     If we take the first exon L1 would be zero Since transcripts start only with an exon there is no space for l1
     So l1 basicaly depends on the intron length If intron length is 0 then L1=0 I have add the code for this statement on
     line 14
     But if intron length is less than 200 then l1 will become start minus half of intron length L1=start-intron_length/2
  2. L2

     According to the question L2 is 100 units of length after sj (start-cordinate) Hence L2 = start + 100
     So l2 basicaly depends on the exon width
     If width less than 200 then l2 will become start plus half of width L2=start+exon_width/2
  3. u1

     According to the question u1 is 100 units of length before sj(end-cordinate) Hence u1 = end - 100
     So u1 also basicaly depends on the exon width
     If width less than 200 then u1 will become end minus half of exon width L2=end - exon_width/2
  4. u2 According to the question u2 is 100 units of length after sj(end-cordinate) Hence u2 = end +100 If we take the last exon u2 would be zero Since transcripts end only with an exon there is no space for u2 So u2 basicaly depends on the next intron length If next intron length is 0 then u2=0 I have add the code for the same But if next intron length is less than 200 then u2 will become end minus half of next intron length L2=start-intron_length/2

Incase if you want the dataframe to not contain values where L2=u1 then please uncomment the lines. It will result in the dataframe where L1 cannot be equal to u1

# Define a function to calculate positions
calculate_positions <- function(exon) {
  # Extract data from the exon
  start <- exon$start
  end <- exon$end
  length <- exon$intron_length
  width <- exon$width
  next_intron_length <- exon$next_intron_length

  # Define inner functions to calculate L1, L2, U1, and U2
  calculate_l1 <- function(start, length) {
    if (length <= 200) {
      if (length == 0) {
         l1 <- 0
      } else {
        if (length %% 2 == 0) {
          length <- length - 1
        }
        
        #    else { length <- length - 2} 
        l1 <- start - (length / 2)
      }
    } else {
      l1 <- start - 100
    }
    return(l1)
  }

  calculate_l2 <- function(start, width) {
    if (width <= 200) {
      if (width %% 2 == 0) {
        width <- width - 1
      } 
      #else {width <- width - 2} 
      l2 <- start + (width / 2)
    } else {
      l2 <- start + 100
    }
    return(l2)
  }

  calculate_u1 <- function(end, width) {
    if (width <= 200) {
      if (width %% 2 == 0) {
        width <- width - 1
      } 
        #else {width <- width - 2}  
      u1 <- end - (width / 2)
    } else {
      u1 <- end - 100
    }
    return(u1)
  }

  calculate_u2 <- function(end, next_intron_length) {
    if (next_intron_length != 0) {
      if (next_intron_length <= 200) {
        if (next_intron_length %% 2 == 0) {
          next_intron_length <- next_intron_length - 1
        }
          #else {next_intron_length <- next_intron_length - 2}
        u2 <- end + (next_intron_length / 2)
      } else {
        u2 <- end + 100
      }
    } else {
      u2 <- 0
    }
    return(u2)
  }

  # Calculate the positions
  l1 <- calculate_l1(start, length)
  l2 <- calculate_l2(start, width)
  u1 <- calculate_u1(end, width)
  u2 <- calculate_u2(end, next_intron_length)

  # Round the positions using floor and ceiling
  l1 <- ceiling(l1)
  l2 <- floor(l2)
  u1 <- ceiling(u1)
  u2 <- floor(u2)

  # Return the calculated positions
  return(data.frame(L1 = l1, L2 = l2, U1 = u1, U2 = u2))
}

# Apply the calculate_positions function to the DataFrame and create new columns
complete_df <- complete_df %>%
  rowwise() %>%
  do(calculate_positions(.)) %>%
  bind_cols(complete_df)

# Sort the combined DataFrame by 'transcript_id' and 'rank'
complete_df <- complete_df %>%
  arrange(transcript_id, rank)

complete_df

##Optimized code for Bonus problem

library(dplyr)
library(magrittr)
## 
## Attaching package: 'magrittr'
## The following object is masked from 'package:tidyr':
## 
##     extract
# function which calculates the values of L1 L2 U1 U2
calculate_positions_vectorized <- function(df) {
  df <- df %>%
    mutate(
      half_intron_length = ifelse(intron_length == 0, 0, ifelse(intron_length %% 2 == 0, intron_length - 1, intron_length) / 2),
      half_exon_length = ifelse(width %% 2 == 0, width - 1, width) / 2,
      next_half_intron_length = ifelse(next_intron_length == 0, 0, ifelse(next_intron_length %% 2 == 0, next_intron_length - 1, next_intron_length) / 2),
      L1 = ifelse(intron_length == 0, 0, ceiling(start - pmin(100, half_intron_length))),
      L2 = floor(start + pmin(100, half_exon_length)),
      U1 = ceiling(end - pmin(100, half_exon_length)),
      U2 = ifelse(next_intron_length == 0, 0, floor(end + pmin(100, next_half_intron_length)))
    )
  
  return(df)
}

# Apply the calculate_positions_vectorized function to the data frame 'df'
final_df <- calculate_positions_vectorized(copy_of_complete_dataframe)

# Sort the combined data frame by 'transcript_id' and 'rank'
final_df <- final_df %>%
  arrange(transcript_id, rank)
# Create a list of column names to exclude
columns_to_exclude <- c("next_intron_length", "half_exon_length","half_intron_length","next_half_intron_length"
)

# Select only the columns that are NOT in the list of columns to exclude
filtered_df <- final_df[, !names(final_df) %in% columns_to_exclude]

filtered_df